Load preprocessed dataset (preprocessing code in data_preprocessing.Rmd)

Keep DE genes

# Keep only the differentially expressed genes (padj < 0.05). A single shared
# mask makes the NA handling explicit and identical across all four objects:
# the original applied !is.na() on only one of the four lines, relying on
# filter()'s implicit NA dropping to keep everything aligned.
keep_gene = !is.na(DE_info$padj) & DE_info$padj < 0.05

datExpr = datExpr %>% filter(rownames(.) %in% rownames(DE_info)[keep_gene])
rownames(datExpr) = datGenes$feature_id[keep_gene]
datGenes = datGenes %>% filter(feature_id %in% rownames(DE_info)[keep_gene])
DE_info = DE_info %>% filter(!is.na(padj) & padj < 0.05)

print(paste0('Keeping ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "Keeping 3000 genes and 80 samples"




Define a gene co-expression similarity

Using Biweight midcorrelation because it’s more robust to outliers than regular correlation or Mutual Information score

# Enable WGCNA multi-threading before the correlation computation
allowWGCNAThreads()
## Allowing multi-threading with up to 4 threads.
# Gene-gene biweight midcorrelation; bicor is built on the median and the MAD
# (median absolute deviation), hence its robustness to outliers
cor_mat = bicor(t(datExpr))

Clustering

# Collects every module assignment produced below, keyed by method name
clusterings = list()

Average linkage seems to be better at separating outliers than identifying modules

# Average linkage on the correlation matrix interpreted as distances
dend = hclust(as.dist(cor_mat), method = 'average')
plot(dend, hang = 0, labels = FALSE)

# Zoom into the bottom of the tree, with gene labels hidden
plot(set(as.dendrogram(dend), 'labels', rep('', nrow(cor_mat))), ylim = c(-0.001, 0.05))

# Complete linkage (hclust's default) gives a more balanced tree
dend = hclust(as.dist(cor_mat))
plot(dend, hang = 0, labels = FALSE)

# Zoomed view of the top of the tree; the blue line marks the candidate cut
plot(set(as.dendrogram(dend), 'labels', rep('', nrow(cor_mat))), ylim = c(0.5, 1))
abline(h = 0.75, col = 'blue')

# Cut into k = 27 modules, named by gene ID
best_k = 27
modules = cutree(dend, k = best_k)
names(modules) = datGenes$feature_id

clusterings[['cor_mat']] = modules

Merge similar modules

# One viridis color per module; moduleEigengenes() groups genes by color label
module_colors = viridis(best_k)

# Module eigengenes: 1st principal component of each module's expression.
# NOTE(review): colors are indexed via labels(dend) (dendrogram leaf order),
# while moduleEigengenes() pairs colors with the columns of t(datExpr)
# (datExpr row order) — confirm these two orderings coincide.
MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])])
MEs = MEs_output$eigengenes
# Rename eigengene columns from 'ME<color>' to 'Mod<module number>'
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )

Merging Modules 4 and 19, and 8, 13, 15, 21, 22, 24, 27

# Eigengene correlation heatmap: highly correlated modules are merged below
heatmap_palette = rev(colorRampPalette(brewer.pal(9, 'YlGnBu'), bias = 3)(100))
bicor(MEs) %>% as.matrix %>% heatmap(scale = 'none', col = heatmap_palette)

# Fold module 19 into 4, and modules 13/15/21/22/24/27 into 8
new_modules = modules
new_modules[modules == 19] = 4
new_modules[modules %in% c(13, 15, 21, 22, 24, 27)] = 8
names(new_modules) = datGenes$feature_id

clusterings[['cor_mat_merged']] = new_modules

rm(best_k, modules, new_modules, dend, MEs_output, MEs, heatmap_palette)

Correcting the correlation matrix from \(s \in [-1,1]\) to \(s \in [0,1]\). Two methods are proposed: \(s_{ij}=|bw(i,j)|\) and \(s_{ij}=\frac{1+bw(i,j)}{2}\)

-Using \(s_{ij}=\frac{1+bw(i,j)}{2}\), the strongest negative correlations (-1) get mapped to 0 (no correlation) and the zero correlated genes get mapped to the average correlation (0.5), which I don’t think makes much sense

-Using \(s_{ij}=|bw(i,j)|\) we lose the direction of the correlation, but at least we maintain the magnitude of the correlation of all the genes. Decided to use this one

# Map bicor from [-1, 1] to [0, 1], keeping magnitude but losing direction
S = abs(cor_mat)

Clustering

# Complete-linkage clustering of the unsigned similarity matrix
dend = hclust(as.dist(S))
plot(dend, hang = 0, labels = FALSE)
abline(h = 0.775, col = 'blue')

# Cut into k = 21 modules, named by gene ID
best_k = 21
modules = cutree(dend, k = best_k)
names(modules) = datGenes$feature_id

clusterings[['S']] = modules

Merge similar modules

# One viridis color per module, used to tag genes for moduleEigengenes()
module_colors = viridis(best_k)

# Module eigengenes (1st PC per module).
# NOTE(review): the colors vector is ordered by labels(dend), whereas
# moduleEigengenes() expects colors aligned with the columns of t(datExpr) —
# verify the orderings agree.
MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])])
MEs = MEs_output$eigengenes
# Rename columns from 'ME<color>' to 'Mod<module number>'
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )

Merging Modules (1, 2, 10 and 14), (6, 7, 16 and 19), (3 and 13), and (4 and 20)

# Eigengene correlation heatmap guiding which modules to merge
heatmap_palette = rev(colorRampPalette(brewer.pal(9, 'YlGnBu'), bias = 3)(100))
bicor(MEs) %>% as.matrix %>% heatmap(scale = 'none', col = heatmap_palette)

# Fold (2, 10, 14) into 1, (7, 16, 19) into 6, 13 into 3, and 20 into 4;
# every test is against the original, unmodified 'modules' vector
new_modules = modules
new_modules[modules %in% c(2, 10, 14)] = 1
new_modules[modules %in% c(7, 16, 19)] = 6
new_modules[modules == 13] = 3
new_modules[modules == 20] = 4
names(new_modules) = datGenes$feature_id

clusterings[['S_merged']] = new_modules

rm(cor_mat, best_k, modules, module_colors, MEs_output, MEs, new_modules, heatmap_palette)




Define a family of adjacency functions

Chose power adjacency function over the sigmoid function because it has only one parameter to adjust and both methods are supposed to lead to very similar results if the parameters are chosen with the scale-free topology criterion.

Choosing a parameter value

Following the scale-free topology criterion because metabolic networks have been found to display approximate scale free topology

  1. Only consider those parameter values that lead to a network satisfying scale-free topology at least approximately, e.g. signed \(R^2 > 0.80\)
# Scan soft-thresholding powers 1..15 and report the smallest power whose
# scale-free topology fit reaches signed R^2 > 0.8
best_power = pickSoftThreshold(t(datExpr), powerVector = 1:15, RsquaredCut = 0.8)
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k.  max.k.
## 1      1  0.00342  0.261          0.988 775.000  770.0000 1330.00
## 2      2  0.24500 -1.490          0.986 257.000  247.0000  636.00
## 3      3  0.58100 -2.220          0.963  98.200   90.3000  331.00
## 4      4  0.74900 -2.510          0.990  41.600   36.6000  186.00
## 5      5  0.83600 -2.560          0.990  19.200   15.8000  110.00
## 6      6  0.87300 -2.590          0.994   9.460    7.2500   67.80
## 7      7  0.90000 -2.520          0.994   4.960    3.4900   43.50
## 8      8  0.90800 -2.440          0.988   2.740    1.7600   28.80
## 9      9  0.88700 -2.380          0.951   1.590    0.9140   19.70
## 10    10  0.39100 -3.380          0.358   0.960    0.4870   13.70
## 11    11  0.38500 -3.210          0.349   0.603    0.2660    9.78
## 12    12  0.93500 -1.980          0.982   0.391    0.1490    7.11
## 13    13  0.95700 -1.830          0.985   0.262    0.0850    5.25
## 14    14  0.93300 -1.770          0.941   0.180    0.0495    3.98
## 15    15  0.94200 -1.780          0.964   0.126    0.0295    3.36
print(paste0('Best power for scale free topology: ', best_power$powerEstimate))
## [1] "Best power for scale free topology: 5"

Elevate the matrix to the suggested power

# Soft thresholding: raise every similarity to the selected power
S_sft = S^best_power$powerEstimate

Clustering

# Complete-linkage clustering of the soft-thresholded similarity matrix
dend = hclust(as.dist(S_sft))
plot(dend, hang = 0, labels = FALSE)
abline(h = 0.28, col = 'blue')

# Cut into k = 21 modules, named by gene ID
best_k = 21
modules = cutree(dend, k = best_k)
names(modules) = datGenes$feature_id

clusterings[['S_sft']] = modules

Merge similar modules

# One viridis color per module for moduleEigengenes()
module_colors = viridis(best_k)

# Module eigengenes (1st PC per module).
# NOTE(review): colors indexed through labels(dend) (leaf order) vs. the
# t(datExpr) column order that moduleEigengenes() expects — confirm they match.
MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])])
MEs = MEs_output$eigengenes
# Rename columns from 'ME<color>' to 'Mod<module number>'
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )

Merging Modules (1, 2, 10 and 14), (6, 7, 16 and 19), (3 and 13), and (4 and 20) (same as with S)

# Eigengene correlation heatmap; the same merges as for S apply here
heatmap_palette = rev(colorRampPalette(brewer.pal(9, 'YlGnBu'), bias = 3)(100))
bicor(MEs) %>% as.matrix %>% heatmap(scale = 'none', col = heatmap_palette)

# Fold (2, 10, 14) into 1, (7, 16, 19) into 6, 13 into 3, and 20 into 4
new_modules = modules
new_modules[modules %in% c(2, 10, 14)] = 1
new_modules[modules %in% c(7, 16, 19)] = 6
new_modules[modules == 13] = 3
new_modules[modules == 20] = 4
names(new_modules) = datGenes$feature_id

clusterings[['S_sft_merged']] = new_modules

rm(dend, best_k, modules, module_colors, MEs_output, MEs, new_modules, heatmap_palette)



Additional considerations

  1. The mean connectivity should be high so that the network contains enough information

What does high mean? using the power=5 we get a mean connectivity of 20, is this high?

# Mean connectivity (average column sum of S^power) for each candidate power
powers = 1:15
mean_connectivity = data.frame(
  power = powers,
  mean_connectivity = sapply(powers, function(p) mean(colSums(S^p)))
)

# Interactive plot; the gray line marks the power chosen above
ggplotly(mean_connectivity %>% ggplot(aes(power, mean_connectivity)) + ylab('mean connectivity') +
         geom_vline(xintercept=best_power$powerEstimate, color='gray') + geom_point(color='#0099cc') + 
         scale_y_sqrt() + theme_minimal())
rm(mean_connectivity, best_power, powers)
  2. The slope \(-\hat{\gamma}\) of the regression line between \(log_{10}(p(k))\) and \(log_{10}(k)\) should be around -1

Slope is 2.6 times as steep as it should be (maybe because the range of k is too narrow?)

# Estimate the degree distribution: bucket genes by connectivity k (column
# sums of the soft-thresholded matrix) into 10 bins, then compute the
# fraction p(k) of genes per bin
k_df = data.frame('connectivity'=colSums(S_sft), 'bucket'=cut(colSums(S_sft), 10)) %>% group_by(bucket) %>%
       dplyr::summarize(p_k=n(), k=mean(connectivity)) %>% mutate(p_k=p_k/sum(p_k))

# Scale-free topology implies log10(p(k)) ~ log10(k) with slope near -1;
# the fitted slope here is -2.67
lm(log10(p_k) ~ log10(k), data=k_df)
## 
## Call:
## lm(formula = log10(p_k) ~ log10(k), data = k_df)
## 
## Coefficients:
## (Intercept)     log10(k)  
##       2.687       -2.670
rm(k_df)

Exploratory analysis of scale free topology adjacency matrix

  • Degree distribution: the scale-free topology adjacency matrix does have an exponential behaviour in the degree distribution which wasn’t there on the original matrix

Note: The slope is very steep, both axis are on sqrt scale

# Sorted similarity values of S and S_sft side by side. Note that ^ binds
# tighter than :, so the original 1:nrow(S)^2 already meant 1:(nrow(S)^2);
# the parentheses below just make that explicit.
n_cells = nrow(S)^2
plot_data = data.frame('n' = 1:n_cells,
                       'S' = sort(melt(S)$value),
                       'S_sft' = sort(melt(S_sft)$value))

# Subsample every 100th value and overlay the two densities (sqrt axes)
plot_data %>% filter(n%%100==0) %>% dplyr::select(S, S_sft) %>% melt %>% 
              ggplot(aes(value, color=variable, fill=variable)) + geom_density(alpha=0.5) + 
              xlab('k') + ylab('p(k)') + scale_x_sqrt() + scale_y_sqrt() + theme_minimal()

rm(plot_data, n_cells)
  • “To visually inspect whether approximate scale-free topology is satisfied, one plots log10(p(k)) versus log10(k). A straight line is indicative of scale-free topology”

The example given in the article has a curve similar to this one and they say it’s OK

# Re-derive the binned degree distribution (same construction as the slope
# check above): 10 connectivity buckets, p(k) = fraction of genes per bucket
k_df = data.frame('connectivity' = colSums(S_sft), 'bucket' = cut(colSums(S_sft), 10)) %>%
       group_by(bucket) %>%
       dplyr::summarize(p_k = n(), k = mean(connectivity)) %>%
       mutate(p_k = p_k / sum(p_k))

# Equivalent plot using log-scaled axes instead of log-transformed values:
# k_df %>% ggplot(aes(k,p_k)) + geom_point(color='#0099cc') + geom_smooth(method='lm', se=FALSE, color='gray') + 
#          ylab('p(k)') + scale_x_log10() + scale_y_log10() + theme_minimal()

# A straight line in log10-log10 space indicates scale-free topology
k_df %>% ggplot(aes(log10(k), log10(p_k))) + geom_point(color='#0099cc') +
         geom_smooth(method='lm', se=FALSE, color='gray') +
         ylab('p(k)') + theme_minimal()

  • “To measure how well a network satisfies a scale-free topology, we propose to use the square of the correlation between log10(p(k)) and log10(k), i.e. the model fitting index \(R^2\) of the linear model that regresses log10(p(k)) on log10(k)”

\(R^2=0.83\) The \(R^2\) we got from the pickSoftThreshold function

# Full regression summary: the Multiple R-squared (0.85) is the scale-free
# fit index, close to the R^2 pickSoftThreshold reported for the chosen power
lm(log10(p_k) ~ log10(k), data=k_df) %>% summary
## 
## Call:
## lm(formula = log10(p_k) ~ log10(k), data = k_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6508 -0.3561  0.1792  0.2841  0.4629 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.6865     0.6685   4.019 0.003848 ** 
## log10(k)     -2.6703     0.3955  -6.751 0.000145 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4222 on 8 degrees of freedom
## Multiple R-squared:  0.8507, Adjusted R-squared:  0.832 
## F-statistic: 45.58 on 1 and 8 DF,  p-value: 0.0001449
rm(k_df)




Defining a measure of node dissimilarity

Using topological overlap dissimilarity measure because it has been found to result in biologically meaningful modules

Create Topological Overlap Matrix (TOM)

1st quartile is already 0.9852, most of the genes are very dissimilar

# Topological Overlap Matrix computed from the soft-thresholded similarity
TOM = TOMsimilarity(S_sft)
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Dissimilarity version, re-attaching the gene names S_sft already carries
dissTOM = 1 - TOM

rownames(dissTOM) = rownames(S_sft)
colnames(dissTOM) = colnames(S_sft)

# Distribution of dissimilarities: the 1st quartile is already 0.9852
summary(melt(dissTOM))
##               Var1                      Var2             value       
##  ENSG00000000971:   3000   ENSG00000000971:   3000   Min.   :0.0000  
##  ENSG00000001084:   3000   ENSG00000001084:   3000   1st Qu.:0.9852  
##  ENSG00000001626:   3000   ENSG00000001626:   3000   Median :0.9918  
##  ENSG00000002586:   3000   ENSG00000002586:   3000   Mean   :0.9882  
##  ENSG00000002746:   3000   ENSG00000002746:   3000   3rd Qu.:0.9956  
##  ENSG00000002933:   3000   ENSG00000002933:   3000   Max.   :0.9999  
##  (Other)        :8982000   (Other)        :8982000

Clustering TOM

# Give TOM the same gene names before clustering
rownames(TOM) = rownames(S_sft)
colnames(TOM) = colnames(S_sft)

# NOTE(review): this passes the *similarity* matrix through as.dist, so high
# topological overlap is treated as large distance — confirm the inversion is
# intended (the dissTOM clustering further down uses 1 - TOM instead).
dend = hclust(as.dist(TOM))
plot(dend, hang = 0, labels = FALSE)
abline(h = 0.14, col = 'blue')

best_k = 19
modules = cutree(dend, k = best_k)
names(modules) = datGenes$feature_id

clusterings[['TOM']] = modules

Merge similar modules

# One viridis color per module for moduleEigengenes()
module_colors = viridis(best_k)

# Module eigengenes (1st PC per module).
# NOTE(review): colors are ordered by labels(dend) while moduleEigengenes()
# aligns colors with t(datExpr) columns — confirm the orderings agree.
MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])])
MEs = MEs_output$eigengenes
# Rename columns from 'ME<color>' to 'Mod<module number>'
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )

Merging Modules (2, 3, 4 and 8)

# Eigengene correlation heatmap guiding the merges
heatmap_palette = rev(colorRampPalette(brewer.pal(9, 'YlGnBu'), bias = 3)(100))
bicor(MEs) %>% as.matrix %>% heatmap(scale = 'none', col = heatmap_palette)

# Fold modules 3, 4 and 8 into module 2
new_modules = modules
new_modules[modules %in% c(3, 4, 8)] = 2
names(new_modules) = datGenes$feature_id

clusterings[['TOM_merged']] = new_modules

rm(S, S_sft, TOM, dend, best_k, modules, module_colors, MEs_output, MEs, new_modules, heatmap_palette)




Identifying gene modules

Using hierarchical clustering on the TOM-based dissimilarity matrix

Using average linkage (following the paper)

In average linkage hierarchical clustering, the distance between two clusters is defined as the average distance between each point in one cluster to every point in the other cluster.

It’s not easy to determine which number of clusters is the optimal

# Average linkage on the TOM dissimilarity, following the WGCNA paper
dend = hclust(as.dist(dissTOM), method = 'average')
plot(dend, hang = 0, labels = FALSE)

Zooming in on the root of the dendrogram you can see that it mainly separates outliers: the data only splits into two big groups much farther down, and even after that it appears to keep peeling off outliers almost one by one. I don’t think this is ideal…

# Zoom into the root of the tree, hiding the gene labels
plot(set(as.dendrogram(dend), 'labels', rep('', nrow(dissTOM))), ylim = c(0.99, 1))

Using complete linkage

In complete linkage hierarchical clustering, the distance between two clusters is defined as the longest distance between two points in each cluster.

It’s not easy to see where you should do the cut

# Complete linkage for comparison
dend = hclust(as.dist(dissTOM))
plot(dend, hang = 0, labels = FALSE)

# Zoomed view near the root; the blue line marks the candidate cut height
plot(set(as.dendrogram(dend), 'labels', rep('', nrow(dissTOM))), ylim = c(0.999, 1))
abline(h = 0.99943, col = 'blue')

Looks balanced…

best_k = 23

# Dendrogram with the k branches colored, gene labels hidden
plot(set(set(as.dendrogram(dend), 'labels', rep('', nrow(dissTOM))), 'branches_k_color', k = best_k))

modules = cutree(dend, k = best_k)
names(modules) = datGenes$feature_id

clusterings[['dissTOM']] = modules

# Module sizes
table(modules)
## modules
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 205  72 297 107 208 153  96   2 267 228 326  69 102 191 147  75  27 113 
##  19  20  21  22  23 
##  80  57  46  19 113




Merging modules with similar expression profiles

“One can relate modules to each other by correlating the corresponding module eigengenes (Horvath et al., 2005). If two modules are highly correlated, one may want to merge them”

Calculate the “eigengenes” (1st principal component) of each module

# One viridis color per module for moduleEigengenes()
module_colors = viridis(best_k)

# Module eigengenes: 1st principal component of each module's expression.
# NOTE(review): colors are indexed via labels(dend) (leaf order) while
# moduleEigengenes() expects colors aligned with t(datExpr) columns — verify.
MEs_output = datExpr %>% t %>% moduleEigengenes(colors=module_colors[as.vector(modules[labels(dend)])])
MEs = MEs_output$eigengenes
# Rename columns from 'ME<color>' to 'Mod<module number>'
colnames(MEs) = sapply(colnames(MEs), function(x) paste0('Mod',which(module_colors == substring(x,3))) )

Merging Modules 3, 5, 6, 9, 10, 11 and 14

# Eigengene correlation heatmap guiding the merges
bicor(MEs) %>% as.matrix %>% heatmap(scale='none', 
                                     col=rev(colorRampPalette(brewer.pal(9,'YlGnBu'), bias=3)(100)))

# Fold modules 5, 6, 9, 10, 11 and 14 into module 3
new_modules = modules %>% replace(modules %in% c(5,6,9,10,11,14), 3)

names(new_modules) = datGenes$feature_id

clusterings[['dissTOM_merged']] = new_modules
viridis_colors = viridis(best_k)

# Color bars under the dendrogram: original vs merged module assignments,
# ordered by dendrogram leaf
dend_colors = data.frame('ID'=names(modules[labels(dend)]),
                         'OriginalModules' = viridis_colors[as.vector(modules[labels(dend)])],
                         'MergedModules' = viridis_colors[as.vector(new_modules[labels(dend)])])

dend %>% as.dendrogram %>% set('labels', rep('', nrow(dissTOM))) %>% set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_colors[,-1])

# Each object listed exactly once: the original call named MEs and new_modules
# twice, which raised 'object not found' warnings on the second removal.
rm(new_modules, MEs, dend, best_k, modules, module_colors, MEs_output, dend_colors, viridis_colors)




Exploratory analysis of the different clusterings

Adjusted Rand Index comparison

# Pairwise Adjusted Rand Index between all clusterings. Only the upper
# triangle (including the diagonal, which is 1 by definition) is filled.
cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in seq_along(clusterings)){
  cluster1 = as.factor(clusterings[[i]])
  for(j in i:length(clusterings)){
    cluster2 = as.factor(clusterings[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)

# Annotated heatmap of the rounded similarities. FALSE is spelled out instead
# of the shorthand F, which is a reassignable variable in R.
cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none', 
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE, 
          cexRow = 1, cexCol = 1, margins = c(8,8))

rm(i, j, cluster1, cluster2, cluster_sim)

PCA

# PCA of the genes (rows of datExpr) to visualize the different clusterings
pca = prcomp(datExpr)

# First three PCs plus one column per clustering, in insertion order
# (cor_mat, cor_mat_merged, S, S_merged, S_sft, S_sft_merged, TOM,
# TOM_merged, dissTOM, dissTOM_merged — same order as the explicit version)
plot_data = data.frame('ID' = rownames(datExpr),
                       'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3])
for(clust_name in names(clusterings)) plot_data[[clust_name]] = clusterings[[clust_name]]

selectable_scatter_plot(plot_data[,-1], plot_data[,-1])
#ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=OriginalModules)) + geom_point(alpha=0.5) + theme_minimal())

rm(pca, plot_data, clust_name)